NS-Forest workflow¶
In [1]:
import sys
import os
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import warnings
# Suppress Python warnings for the rest of the session to keep cell output readable.
warnings.filterwarnings('ignore')
# Location of the NSForest folder. Set the NSFOREST_CODE_PATH environment variable to use a
# different checkout (e.g. "/Users/zhangy71/NSForest") without editing this cell; the default
# is the location of NSForest in cloudos.lifebit.ai.
CODE_PATH = os.environ.get("NSFOREST_CODE_PATH", "/home/jovyan/session_data/NSForest")
# Put the folder first on sys.path so the `nsforest` import below is looked up there first.
sys.path.insert(0, os.path.abspath(CODE_PATH))
from nsforest import ns, nsforesting, utils, NSFOREST_VERSION
/mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_csv from `anndata` is deprecated. Import anndata.io.read_csv instead. warnings.warn(msg, FutureWarning) /mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_text from `anndata` is deprecated. Import anndata.io.read_text instead. warnings.warn(msg, FutureWarning) /mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_excel from `anndata` is deprecated. Import anndata.io.read_excel instead. warnings.warn(msg, FutureWarning) /mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_mtx from `anndata` is deprecated. Import anndata.io.read_mtx instead. warnings.warn(msg, FutureWarning) /mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_loom from `anndata` is deprecated. Import anndata.io.read_loom instead. warnings.warn(msg, FutureWarning) /mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_hdf from `anndata` is deprecated. Import anndata.io.read_hdf instead. warnings.warn(msg, FutureWarning) /mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_csv from `anndata` is deprecated. Import anndata.io.read_csv instead. warnings.warn(msg, FutureWarning) /mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_excel from `anndata` is deprecated. Import anndata.io.read_excel instead. warnings.warn(msg, FutureWarning) /mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_hdf from `anndata` is deprecated. Import anndata.io.read_hdf instead. 
warnings.warn(msg, FutureWarning) /mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_loom from `anndata` is deprecated. Import anndata.io.read_loom instead. warnings.warn(msg, FutureWarning) /mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_mtx from `anndata` is deprecated. Import anndata.io.read_mtx instead. warnings.warn(msg, FutureWarning) /mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_text from `anndata` is deprecated. Import anndata.io.read_text instead. warnings.warn(msg, FutureWarning) /mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_umi_tools from `anndata` is deprecated. Import anndata.io.read_umi_tools instead. warnings.warn(msg, FutureWarning)
In [2]:
# Remove the row limit so that displayed DataFrames are never truncated.
pd.options.display.max_rows = None
0. Set up¶
In [3]:
## set up
## lines marked with "#<---" are the values to edit for a new dataset
organ = "kidney" #<---
author = "Lake" #<---
year = "2023" #<---
# All output files are written under this folder, e.g., "outputs_kidney_Lake_2023/".
# The trailing "/" is required: later cells build file paths by plain string concatenation.
output_folder = f"outputs_{organ}_{author}_{year}/"
# Create the folder up front so the save cells do not depend on the dendrogram cell
# having been run first.
os.makedirs(output_folder, exist_ok=True)
# Name of the adata.obs column that holds the cluster labels.
cluster_header = "subclass.full" #<---
# Output file names are tagged with the cluster column name, both as suffix and as prefix.
outputfilename_suffix = cluster_header
outputfilename_prefix = cluster_header
1. Data¶
[TODO: need to filter the data to keep only normal cells -- Anne]¶
In [4]:
# Folder that contains the input .h5ad file. Set the NSFOREST_DATA_FOLDER environment variable
# to read from another location (e.g. "/Users/zhangy71/Documents/Kidney-2025/Data/Lake-KPMP-2023/")
# without editing this cell; the default is the data mount used when running on cloudos.lifebit.ai.
data_folder = os.environ.get("NSFOREST_DATA_FOLDER", "/home/jovyan/session_data/mounted-data-readonly/")
# The file name is appended to this value later, so guarantee exactly one trailing separator.
data_folder = os.path.join(data_folder, "")
In [5]:
# Load the input AnnData object. os.path.join gives the same path as plain concatenation when
# data_folder ends with "/", and still gives a valid path when the trailing "/" is missing.
adata = sc.read_h5ad(os.path.join(data_folder, "adata_normal_n3566.h5ad")) #<---
In [6]:
# Display the AnnData summary (dimensions and the names of the obs/var/uns/obsm fields) as a sanity check.
adata
Out[6]:
AnnData object with n_obs × n_vars = 3566 × 33826
obs: 'nCount_RNA', 'nFeature_RNA', 'library', 'percent.er', 'percent.mt', 'degen.score', 'aEpi.score', 'aStr.score', 'cyc.score', 'matrisome.score', 'collagen.score', 'glycoprotein.score', 'proteoglycan.score', 'S.Score', 'G2M.Score', 'experiment', 'specimen', 'condition.long', 'condition.l1', 'condition.l2', 'donor_id', 'region.l1', 'region.l2', 'percent.cortex', 'percent.medulla', 'sample_tissue_type', 'id', 'pagoda_k100_infomap_coembed', 'subclass.full', 'subclass.l3', 'subclass.l2', 'subclass.l1', 'state.l2', 'state', 'class', 'structure', 'disease_ontology_term_id', 'sex_ontology_term_id', 'development_stage_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'eGFR', 'BMI', 'diabetes_history', 'hypertension', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'is_primary_data', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
uns: 'citation', 'organism', 'organism_ontology_term_id', 'schema_reference', 'schema_version', 'title'
obsm: 'X_umap'
2. Clusters¶
number of clusters¶
In [7]:
## number of clusters
# Count the distinct labels found in the cluster column of adata.obs.
cluster_labels = adata.obs[cluster_header]
n_clusters = cluster_labels.nunique()
n_clusters
Out[7]:
75
dendrogram (run this to automatically create the output folder)¶
In [8]:
## auto-adjust figsize
# Width: one unit per 5 clusters, but never below 2, so that fewer than 5 clusters
# does not produce a zero-width figure (mirrors the lower bound used for the height).
fig_width = max(2, n_clusters // 5)
# Height: one extra unit per 30 characters of the longest cluster label, never below 2.
# Labels go through str() so non-string labels (e.g. integer cluster ids) also work,
# and default=0 keeps this from failing when there are no labels at all.
longest_label = max((len(str(z)) for z in adata.obs[cluster_header].unique()), default=0)
fig_height = max(2, longest_label // 30 + 1)
In [9]:
## dendrogram and save svg
# The figure is written into output_folder (see the "saving figure to file" message below).
# A later cell reads the cluster ordering from adata.uns["dendrogram_" + cluster_header],
# so this cell must run before it.
ns.pp.dendrogram(
    adata,
    cluster_header,
    figsize = (fig_width, fig_height),
    tl_kwargs = {'optimal_ordering': True},
    save = "svg",
    output_folder = output_folder,
    outputfilename_suffix = outputfilename_suffix,
)
WARNING: You’re trying to run this on 33826 dimensions of `.X`, if you really want this, set `use_rep='X'`.
Falling back to preprocessing with `sc.pp.pca` and default params.
WARNING: saving figure to file outputs_kidney_Lake_2023/dendrogram_subclass.full.svg
cluster sizes¶
In [10]:
## cluster sizes
# Number of cells carrying each cluster label, as a one-column table indexed by label.
df_cluster_sizes = adata.obs[cluster_header].value_counts().to_frame()
df_cluster_sizes
Out[10]:
| count | |
|---|---|
| subclass.full | |
| Adaptive / Maladaptive / Repairing Fibroblast | 50 |
| Adaptive / Maladaptive / Repairing Proximal Tubule Epithelial Cell | 50 |
| Mesangial Cell | 50 |
| Medullary Thick Ascending Limb Cell | 50 |
| Medullary Fibroblast | 50 |
| Mast Cell | 50 |
| Macula Densa Cell | 50 |
| M2 Macrophage | 50 |
| Lymphatic Endothelial Cell | 50 |
| Intercalated Cell Type B | 50 |
| Inner Medullary Collecting Duct Cell | 50 |
| Glomerular Capillary Endothelial Cell | 50 |
| Fibroblast | 50 |
| Distal Convoluted Tubule Cell Type 2 | 50 |
| Distal Convoluted Tubule Cell Type 1 | 50 |
| Descending Vasa Recta Endothelial Cell | 50 |
| Descending Thin Limb Cell Type 3 | 50 |
| Monocyte-derived Cell | 50 |
| Myofibroblast | 50 |
| Natural Killer Cell / Natural Killer T Cell | 50 |
| Podocyte | 50 |
| Vascular Smooth Muscle Cell | 50 |
| Transitional Principal-Intercalated Cell | 50 |
| T Cell | 50 |
| Renin-positive Juxtaglomerular Granular Cell | 50 |
| Proximal Tubule Epithelial Cell Segment 3 | 50 |
| Proximal Tubule Epithelial Cell Segment 1 / Segment 2 | 50 |
| Plasma Cell | 50 |
| Neutrophil | 50 |
| Peritubular Capilary Endothelial Cell | 50 |
| Parietal Epithelial Cell | 50 |
| Papillary Tip Epithelial Cell | 50 |
| Outer Medullary Collecting Duct Principal Cell | 50 |
| Outer Medullary Collecting Duct Intercalated Cell Type A | 50 |
| Non-classical Monocyte | 50 |
| Descending Thin Limb Cell Type 2 | 50 |
| Descending Thin Limb Cell Type 1 | 50 |
| Degenerative Vascular Smooth Muscle Cell | 50 |
| Cycling Proximal Tubule Epithelial Cell | 50 |
| Adaptive / Maladaptive / Repairing Thick Ascending Limb Cell | 50 |
| Afferent / Efferent Arteriole Endothelial Cell | 50 |
| Ascending Thin Limb Cell | 50 |
| Ascending Vasa Recta Endothelial Cell | 50 |
| B Cell | 50 |
| Classical Dendritic Cell | 50 |
| Connecting Tubule Cell | 50 |
| Connecting Tubule Intercalated Cell Type A | 50 |
| Connecting Tubule Principal Cell | 50 |
| Cortical Collecting Duct Intercalated Cell Type A | 50 |
| Cortical Collecting Duct Principal Cell | 50 |
| Cortical Thick Ascending Limb Cell | 50 |
| Cycling Endothelial Cell | 50 |
| Degenerative Proximal Tubule Epithelial Cell | 50 |
| Vascular Smooth Muscle Cell / Pericyte | 50 |
| Degenerative Endothelial Cell | 50 |
| Degenerative Medullary Fibroblast | 50 |
| Degenerative Ascending Thin Limb Cell | 50 |
| Degenerative Connecting Tubule Cell | 50 |
| Degenerative Podocyte | 50 |
| Degenerative Peritubular Capilary Endothelial Cell | 50 |
| Degenerative Cortical Intercalated Cell Type A | 50 |
| Degenerative Cortical Thick Ascending Limb Cell | 50 |
| Degenerative Descending Thin Limb Cell Type 3 | 50 |
| Degenerative Distal Convoluted Tubule Cell | 50 |
| Degenerative Outer Medullary Collecting Duct Principal Cell | 50 |
| Degenerative Medullary Thick Ascending Limb Cell | 50 |
| Degenerative Fibroblast | 50 |
| Degenerative Inner Medullary Collecting Duct Cell | 50 |
| Cycling Mononuclear Phagocyte | 48 |
| Schwann Cell / Neural | 35 |
| Plasmacytoid Dendritic Cell | 31 |
| Cycling Myofibroblast | 18 |
| Cycling Natural Killer Cell / Natural Killer T Cell | 16 |
| Cycling Connecting Tubule Cell | 12 |
| Cycling Distal Convoluted Tubule Cell | 6 |
In [11]:
## save
# Write the cluster-size table (index included) to <output_folder><prefix>_cluster_sizes.csv.
cluster_sizes_csv = f"{output_folder}{outputfilename_prefix}_cluster_sizes.csv"
df_cluster_sizes.to_csv(cluster_sizes_csv)
cluster order¶
In [12]:
# Cluster names in the order stored by the dendrogram step, with surrounding whitespace removed.
dendrogram_info = adata.uns["dendrogram_" + cluster_header]
cluster_order = []
for label in dendrogram_info['categories_ordered']:
    cluster_order.append(label.strip())
In [13]:
## save
# Write the ordered cluster names as a single-column CSV (no index column).
cluster_order_df = pd.DataFrame({'cluster_order': cluster_order})
cluster_order_df.to_csv(f"{output_folder}{outputfilename_prefix}_cluster_order.csv", index=False)
summary statistics of data (normal cells)¶
In [14]:
# One-row summary of the loaded data: number of observations, variables and clusters.
summary_record = {'n_obs': adata.n_obs, 'n_vars': adata.n_vars, 'n_clusters': n_clusters}
df_normal = pd.DataFrame([summary_record])
df_normal
Out[14]:
| n_obs | n_vars | n_clusters | |
|---|---|---|---|
| 0 | 3566 | 33826 | 75 |
In [15]:
## save
# Write the one-row summary table without the index column.
summary_csv = f"{output_folder}{outputfilename_prefix}_summary_normal.csv"
df_normal.to_csv(summary_csv, index=False)
3. NS-Forest¶
prep¶
In [16]:
## Work on a copy because the median step below keeps only the positive genes
## (its log reports "Only positive genes selected").
## The original `adata` is left untouched so the full data stays available for plotting.
adata_prep = adata.copy()
In [17]:
## get medians
## The returned object replaces adata_prep. Per the log output below, the per-cluster medians are
## saved under varm as "medians_" + cluster_header and only the positive genes are kept.
## NOTE(review): because adata_prep is overwritten, re-running this cell presumably operates on the
## already-filtered object -- re-run the copy cell first; confirm against the NSForest docs.
adata_prep = ns.pp.prep_medians(adata_prep, cluster_header)
Calculating medians...
Calculating medians (means) per cluster: 100%|██████████| 75/75 [00:07<00:00, 9.90it/s]
Saving calculated medians as adata.varm.medians_subclass.full --- 7.585983753204346 seconds --- median: 0.0 mean: 0.021279225 std: 0.18309134 Only positive genes selected. 5607 positive genes out of 33826 total genes
In [18]:
## get binary scores
## The returned object replaces adata_prep. Per the log output below, the scores are saved under
## varm as "binary_scores_" + cluster_header. This step took about 2 minutes in the recorded run.
adata_prep = ns.pp.prep_binary_scores(adata_prep, cluster_header)
Calculating binary scores...
Calculating binary scores per cluster: 100%|██████████| 75/75 [01:55<00:00, 1.54s/it]
Saving calculated binary scores as adata.varm.binary_scores_subclass.full --- 115.82460403442383 seconds --- median: 0.0 mean: 0.08331220064118282 std: 0.24597213799454656
In [19]:
## check medians
# Pull the medians table stored by the median step and show its shape and first rows.
medians_key = f"medians_{cluster_header}"
df_medians = adata_prep.varm[medians_key]
print(df_medians.shape)
df_medians.head()
(5607, 75)
Out[19]:
| Adaptive / Maladaptive / Repairing Fibroblast | Adaptive / Maladaptive / Repairing Proximal Tubule Epithelial Cell | Adaptive / Maladaptive / Repairing Thick Ascending Limb Cell | Afferent / Efferent Arteriole Endothelial Cell | Ascending Thin Limb Cell | Ascending Vasa Recta Endothelial Cell | B Cell | Classical Dendritic Cell | Connecting Tubule Cell | Connecting Tubule Intercalated Cell Type A | ... | Plasmacytoid Dendritic Cell | Podocyte | Proximal Tubule Epithelial Cell Segment 1 / Segment 2 | Proximal Tubule Epithelial Cell Segment 3 | Renin-positive Juxtaglomerular Granular Cell | Schwann Cell / Neural | T Cell | Transitional Principal-Intercalated Cell | Vascular Smooth Muscle Cell | Vascular Smooth Muscle Cell / Pericyte | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ENSG00000175899 | 0.0 | 0.000000 | 0.000000 | 2.1501 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.985069 | 0.0 | 0.000000 | 0.000000 | 0.0 | 1.36513 |
| ENSG00000128274 | 0.0 | 0.000000 | 0.000000 | 0.0000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.00000 |
| ENSG00000103591 | 0.0 | 0.000000 | 0.000000 | 0.0000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.00000 |
| ENSG00000115977 | 0.0 | 1.092519 | 1.096968 | 0.0000 | 1.399926 | 0.0 | 0.0 | 0.0 | 1.269087 | 1.171837 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.595464 | 0.931235 | 0.0 | 0.00000 |
| ENSG00000157426 | 0.0 | 0.000000 | 0.000000 | 0.0000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.00000 |
5 rows × 75 columns
In [20]:
## check binary scores
# Pull the binary-scores table stored by the binary-score step and show its shape and first rows.
binary_scores_key = f"binary_scores_{cluster_header}"
df_binary_scores = adata_prep.varm[binary_scores_key]
print(df_binary_scores.shape)
df_binary_scores.head()
(5607, 75)
Out[20]:
| Adaptive / Maladaptive / Repairing Fibroblast | Adaptive / Maladaptive / Repairing Proximal Tubule Epithelial Cell | Adaptive / Maladaptive / Repairing Thick Ascending Limb Cell | Afferent / Efferent Arteriole Endothelial Cell | Ascending Thin Limb Cell | Ascending Vasa Recta Endothelial Cell | B Cell | Classical Dendritic Cell | Connecting Tubule Cell | Connecting Tubule Intercalated Cell Type A | ... | Plasmacytoid Dendritic Cell | Podocyte | Proximal Tubule Epithelial Cell Segment 1 / Segment 2 | Proximal Tubule Epithelial Cell Segment 3 | Renin-positive Juxtaglomerular Granular Cell | Schwann Cell / Neural | T Cell | Transitional Principal-Intercalated Cell | Vascular Smooth Muscle Cell | Vascular Smooth Muscle Cell / Pericyte | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ENSG00000175899 | 0.0 | 0.000000 | 0.000000 | 0.936407 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.894096 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.902851 |
| ENSG00000128274 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 |
| ENSG00000103591 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 |
| ENSG00000115977 | 0.0 | 0.629325 | 0.629896 | 0.000000 | 0.674722 | 0.0 | 0.0 | 0.0 | 0.654184 | 0.639728 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.597169 | 0.615731 | 0.0 | 0.000000 |
| ENSG00000157426 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 |
5 rows × 75 columns
In [21]:
## save csv and pkl
# Each table is written twice under the same base name: first as .csv, then as .pkl.
for table_name, table in (("medians", df_medians), ("binary_scores", df_binary_scores)):
    base_path = f"{output_folder}{outputfilename_prefix}_{table_name}"
    table.to_csv(base_path + ".csv")
    table.to_pickle(base_path + ".pkl")
histograms of non-zero values [TO-DO: nice to have them as functions -- Beverly]¶
In [22]:
# Blank out the zero entries, then flatten the remaining values into a 1-D array.
medians_without_zeros = df_medians.mask(df_medians == 0)
non_zero_medians = medians_without_zeros.stack().values
In [23]:
# Histogram of the non-zero medians: saved as SVG in the output folder, then displayed.
# An explicit figure/axes pair is used so the plot does not depend on pyplot's "current figure",
# and both axes are labelled so the saved SVG can be read on its own.
fig, ax = plt.subplots()
ax.hist(non_zero_medians, bins=100)
ax.set_title("Non-zero medians")
ax.set_xlabel("Median value")
ax.set_ylabel("Count")
fig.savefig(output_folder + "hist_nonzero_medians_" + outputfilename_suffix + ".svg")
plt.show()
In [24]:
# Blank out the zero entries, then flatten the remaining values into a 1-D array.
binary_scores_without_zeros = df_binary_scores.mask(df_binary_scores == 0)
non_zero_binary_scores = binary_scores_without_zeros.stack().values
In [25]:
# Histogram of the non-zero binary scores: saved as SVG in the output folder, then displayed.
# An explicit figure/axes pair is used so the plot does not depend on pyplot's "current figure",
# and both axes are labelled so the saved SVG can be read on its own.
fig, ax = plt.subplots()
ax.hist(non_zero_binary_scores, bins=100)
ax.set_title("Non-zero binary scores")
ax.set_xlabel("Binary score")
ax.set_ylabel("Count")
fig.savefig(output_folder + "hist_nonzero_binary_scores_" + outputfilename_suffix + ".svg")
plt.show()
run NSForest()¶
In [26]:
# Run NS-Forest on the prepared data. With save / save_supplementary enabled, the result tables
# are written into output_folder using outputfilename_prefix (see the "Saving ..." log lines).
# The recorded run took roughly 6 minutes.
results = nsforesting.NSForest(
    adata_prep,
    cluster_header,
    save = True,
    save_supplementary = True,
    output_folder = output_folder,
    outputfilename_prefix = outputfilename_prefix,
)
Running NS-Forest version 4.1 Preparing adata... --- 0.027405738830566406 seconds --- Pre-selecting genes based on binary scores... BinaryFirst_high Threshold (mean + 2 * std): 0.5752564766302759 Average number of genes after gene_selection in each cluster: 505.8933333333333 Saving number of genes selected per cluster as... outputs_kidney_Lake_2023/subclass.full_gene_selection.csv Number of clusters to evaluate: 75 1 out of 75: Adaptive / Maladaptive / Repairing Fibroblast Pre-selected 293 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000156218', 'ENSG00000077942', 'ENSG00000185070'] fbeta: 0.561 precision: 0.917 recall: 0.22 2 out of 75: Adaptive / Maladaptive / Repairing Proximal Tubule Epithelial Cell Pre-selected 786 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000135220', 'ENSG00000170579'] fbeta: 0.634 precision: 0.783 recall: 0.36 3 out of 75: Adaptive / Maladaptive / Repairing Thick Ascending Limb Cell Pre-selected 947 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000129682', 'ENSG00000115221', 'ENSG00000129151'] fbeta: 0.616 precision: 0.773 recall: 0.34 4 out of 75: Afferent / Efferent Arteriole Endothelial Cell Pre-selected 397 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000135919', 'ENSG00000131477'] fbeta: 0.761 precision: 0.955 recall: 0.42 5 out of 75: Ascending Thin Limb Cell Pre-selected 820 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000184374', 'ENSG00000275395'] fbeta: 0.616 precision: 0.773 recall: 0.34 6 out of 75: Ascending Vasa Recta Endothelial Cell Pre-selected 299 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000148488', 'ENSG00000283632'] fbeta: 0.731 precision: 0.95 recall: 0.38 7 out of 75: B Cell Pre-selected 44 genes to feed into Random Forest. 
NSForest-selected markers: ['ENSG00000156738'] fbeta: 0.773 precision: 0.833 recall: 0.6 8 out of 75: Classical Dendritic Cell Pre-selected 353 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000131203'] fbeta: 0.857 precision: 0.9 recall: 0.72 9 out of 75: Connecting Tubule Cell Pre-selected 363 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000251504', 'ENSG00000165685', 'ENSG00000226674'] fbeta: 0.448 precision: 0.571 recall: 0.24 10 out of 75: Connecting Tubule Intercalated Cell Type A Pre-selected 644 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000112530', 'ENSG00000145147'] fbeta: 0.325 precision: 0.385 recall: 0.2 11 out of 75: Connecting Tubule Principal Cell Pre-selected 653 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000159167', 'ENSG00000104327', 'ENSG00000204323'] fbeta: 0.556 precision: 0.737 recall: 0.28 12 out of 75: Cortical Collecting Duct Intercalated Cell Type A Pre-selected 376 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000278961', 'ENSG00000185274', 'ENSG00000145147'] fbeta: 0.591 precision: 0.867 recall: 0.26 13 out of 75: Cortical Collecting Duct Principal Cell Pre-selected 971 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000182752', 'ENSG00000184672'] fbeta: 0.563 precision: 0.696 recall: 0.32 14 out of 75: Cortical Thick Ascending Limb Cell Pre-selected 425 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000113946', 'ENSG00000036672'] fbeta: 0.534 precision: 0.594 recall: 0.38 15 out of 75: Cycling Connecting Tubule Cell Pre-selected 1128 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000102870', 'ENSG00000158805'] fbeta: 0.341 precision: 0.375 recall: 0.25 16 out of 75: Cycling Distal Convoluted Tubule Cell Pre-selected 1774 genes to feed into Random Forest. 
NSForest-selected markers: ['ENSG00000058404', 'ENSG00000066279'] fbeta: 0.714 precision: 1.0 recall: 0.333 17 out of 75: Cycling Endothelial Cell Pre-selected 290 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000178878', 'ENSG00000139734'] fbeta: 0.57 precision: 0.812 recall: 0.26 18 out of 75: Cycling Mononuclear Phagocyte Pre-selected 189 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000077420', 'ENSG00000101639', 'ENSG00000185811'] fbeta: 0.375 precision: 0.75 recall: 0.125 19 out of 75: Cycling Myofibroblast Pre-selected 768 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000143476', 'ENSG00000152402'] fbeta: 0.761 precision: 1.0 recall: 0.389 20 out of 75: Cycling Natural Killer Cell / Natural Killer T Cell Pre-selected 723 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000166803', 'ENSG00000183918'] fbeta: 0.893 precision: 1.0 recall: 0.625 21 out of 75: Cycling Proximal Tubule Epithelial Cell Pre-selected 275 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000165304', 'ENSG00000132780'] fbeta: 0.354 precision: 0.364 recall: 0.32 22 out of 75: Degenerative Ascending Thin Limb Cell Pre-selected 646 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000197249', 'ENSG00000140600'] fbeta: 0.593 precision: 0.639 recall: 0.46 23 out of 75: Degenerative Connecting Tubule Cell Pre-selected 644 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000162896', 'ENSG00000145536', 'ENSG00000104327'] fbeta: 0.5 precision: 0.9 recall: 0.18 24 out of 75: Degenerative Cortical Intercalated Cell Type A Pre-selected 908 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000130203', 'ENSG00000151418'] fbeta: 0.718 precision: 0.806 recall: 0.5 25 out of 75: Degenerative Cortical Thick Ascending Limb Cell Pre-selected 150 genes to feed into Random Forest. 
NSForest-selected markers: ['ENSG00000198431', 'ENSG00000124107'] fbeta: 0.636 precision: 0.882 recall: 0.3 26 out of 75: Degenerative Descending Thin Limb Cell Type 3 Pre-selected 1607 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000133475', 'ENSG00000115641'] fbeta: 0.787 precision: 0.875 recall: 0.56 27 out of 75: Degenerative Distal Convoluted Tubule Cell Pre-selected 995 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000121769', 'ENSG00000178343'] fbeta: 0.742 precision: 0.818 recall: 0.54 28 out of 75: Degenerative Endothelial Cell Pre-selected 54 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000142089', 'ENSG00000184831'] fbeta: 0.426 precision: 0.727 recall: 0.16 29 out of 75: Degenerative Fibroblast Pre-selected 216 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000139329', 'ENSG00000142173'] fbeta: 0.812 precision: 0.962 recall: 0.5 30 out of 75: Degenerative Inner Medullary Collecting Duct Cell Pre-selected 1131 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000095303', 'ENSG00000171303', 'ENSG00000258551'] fbeta: 0.567 precision: 0.68 recall: 0.34 31 out of 75: Degenerative Medullary Fibroblast Pre-selected 13 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000085662', 'ENSG00000171345'] fbeta: 0.241 precision: 0.222 recall: 0.36 32 out of 75: Degenerative Medullary Thick Ascending Limb Cell Pre-selected 127 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000169344', 'ENSG00000119715', 'ENSG00000074803'] fbeta: 0.413 precision: 0.422 recall: 0.38 33 out of 75: Degenerative Outer Medullary Collecting Duct Principal Cell Pre-selected 769 genes to feed into Random Forest. 
NSForest-selected markers: ['ENSG00000085117', 'ENSG00000166828', 'ENSG00000085563', 'ENSG00000165272'] fbeta: 0.508 precision: 0.706 recall: 0.24 34 out of 75: Degenerative Peritubular Capilary Endothelial Cell Pre-selected 175 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000102755', 'ENSG00000154217', 'ENSG00000142798'] fbeta: 0.32 precision: 0.32 recall: 0.32 35 out of 75: Degenerative Podocyte Pre-selected 209 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000166033', 'ENSG00000107742'] fbeta: 0.873 precision: 1.0 recall: 0.58 36 out of 75: Degenerative Proximal Tubule Epithelial Cell Pre-selected 240 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000259579', 'ENSG00000164626', 'ENSG00000227258'] fbeta: 0.688 precision: 0.864 recall: 0.38 37 out of 75: Degenerative Vascular Smooth Muscle Cell Pre-selected 536 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000198467', 'ENSG00000198523'] fbeta: 0.86 precision: 0.941 recall: 0.64 38 out of 75: Descending Thin Limb Cell Type 1 Pre-selected 343 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000197301', 'ENSG00000150471'] fbeta: 0.556 precision: 1.0 recall: 0.2 39 out of 75: Descending Thin Limb Cell Type 2 Pre-selected 560 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000235139', 'ENSG00000188883'] fbeta: 0.648 precision: 0.75 recall: 0.42 40 out of 75: Descending Thin Limb Cell Type 3 Pre-selected 745 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000249776', 'ENSG00000145721'] fbeta: 0.546 precision: 0.613 recall: 0.38 41 out of 75: Descending Vasa Recta Endothelial Cell Pre-selected 519 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000141469', 'ENSG00000116833'] fbeta: 0.636 precision: 0.933 recall: 0.28 42 out of 75: Distal Convoluted Tubule Cell Type 1 Pre-selected 889 genes to feed into Random Forest. 
NSForest-selected markers: ['ENSG00000237422', 'ENSG00000119121'] fbeta: 0.675 precision: 0.895 recall: 0.34 43 out of 75: Distal Convoluted Tubule Cell Type 2 Pre-selected 559 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000165973', 'ENSG00000070915', 'ENSG00000182168'] fbeta: 0.549 precision: 0.606 recall: 0.4 44 out of 75: Fibroblast Pre-selected 170 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000140092', 'ENSG00000112936', 'ENSG00000116962'] fbeta: 0.574 precision: 0.778 recall: 0.28 45 out of 75: Glomerular Capillary Endothelial Cell Pre-selected 251 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000167941', 'ENSG00000145708'] fbeta: 0.746 precision: 0.952 recall: 0.4 46 out of 75: Inner Medullary Collecting Duct Cell Pre-selected 1433 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000228624', 'ENSG00000249853', 'ENSG00000223561'] fbeta: 0.682 precision: 0.808 recall: 0.42 47 out of 75: Intercalated Cell Type B Pre-selected 362 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000091137', 'ENSG00000188175'] fbeta: 0.775 precision: 0.957 recall: 0.44 48 out of 75: Lymphatic Endothelial Cell Pre-selected 243 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000138722'] fbeta: 0.891 precision: 0.947 recall: 0.72 49 out of 75: M2 Macrophage Pre-selected 78 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000260314', 'ENSG00000137491'] fbeta: 0.714 precision: 0.947 recall: 0.36 50 out of 75: Macula Densa Cell Pre-selected 672 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000139220', 'ENSG00000089250'] fbeta: 0.656 precision: 0.889 recall: 0.32 51 out of 75: Mast Cell Pre-selected 41 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000185052'] fbeta: 0.891 precision: 1.0 recall: 0.62 52 out of 75: Medullary Fibroblast Pre-selected 344 genes to feed into Random Forest. 
NSForest-selected markers: ['ENSG00000182256', 'ENSG00000259275'] fbeta: 0.704 precision: 0.87 recall: 0.4 53 out of 75: Medullary Thick Ascending Limb Cell Pre-selected 1032 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000159261', 'ENSG00000066230', 'ENSG00000117707'] fbeta: 0.634 precision: 0.81 recall: 0.34 54 out of 75: Mesangial Cell Pre-selected 311 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000188517', 'ENSG00000133816'] fbeta: 0.711 precision: 0.771 recall: 0.54 55 out of 75: Monocyte-derived Cell Pre-selected 77 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000137462', 'ENSG00000136250'] fbeta: 0.369 precision: 0.5 recall: 0.18 56 out of 75: Myofibroblast Pre-selected 371 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000198542', 'ENSG00000249669'] fbeta: 0.636 precision: 0.933 recall: 0.28 57 out of 75: Natural Killer Cell / Natural Killer T Cell Pre-selected 177 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000105374', 'ENSG00000275302'] fbeta: 0.78 precision: 0.853 recall: 0.58 58 out of 75: Neutrophil Pre-selected 54 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000143546'] fbeta: 0.817 precision: 0.868 recall: 0.66 59 out of 75: Non-classical Monocyte Pre-selected 128 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000085265', 'ENSG00000028277'] fbeta: 0.634 precision: 0.81 recall: 0.34 60 out of 75: Outer Medullary Collecting Duct Intercalated Cell Type A Pre-selected 1984 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000253675', 'ENSG00000258881'] fbeta: 0.776 precision: 0.871 recall: 0.54 61 out of 75: Outer Medullary Collecting Duct Principal Cell Pre-selected 1300 genes to feed into Random Forest. 
NSForest-selected markers: ['ENSG00000254789', 'ENSG00000267659'] fbeta: 0.652 precision: 0.818 recall: 0.36 62 out of 75: Papillary Tip Epithelial Cell Pre-selected 642 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000174226', 'ENSG00000142973'] fbeta: 0.784 precision: 1.0 recall: 0.42 63 out of 75: Parietal Epithelial Cell Pre-selected 158 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000112414', 'ENSG00000000971'] fbeta: 0.637 precision: 1.0 recall: 0.26 64 out of 75: Peritubular Capilary Endothelial Cell Pre-selected 112 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000163687', 'ENSG00000102755'] fbeta: 0.584 precision: 0.692 recall: 0.36 65 out of 75: Plasma Cell Pre-selected 86 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000167077', 'ENSG00000183508'] fbeta: 0.784 precision: 1.0 recall: 0.42 66 out of 75: Plasmacytoid Dendritic Cell Pre-selected 199 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000267337'] fbeta: 0.901 precision: 1.0 recall: 0.645 67 out of 75: Podocyte Pre-selected 746 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000139304', 'ENSG00000178038'] fbeta: 0.902 precision: 0.972 recall: 0.7 68 out of 75: Proximal Tubule Epithelial Cell Segment 1 / Segment 2 Pre-selected 102 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000171759', 'ENSG00000171766'] fbeta: 0.702 precision: 1.0 recall: 0.32 69 out of 75: Proximal Tubule Epithelial Cell Segment 3 Pre-selected 245 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000158865', 'ENSG00000156222'] fbeta: 0.682 precision: 1.0 recall: 0.3 70 out of 75: Renin-positive Juxtaglomerular Granular Cell Pre-selected 173 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000143839'] fbeta: 0.954 precision: 0.943 recall: 1.0 71 out of 75: Schwann Cell / Neural Pre-selected 94 genes to feed into Random Forest. 
NSForest-selected markers: ['ENSG00000179915'] fbeta: 0.952 precision: 1.0 recall: 0.8 72 out of 75: T Cell Pre-selected 37 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000168685'] fbeta: 0.647 precision: 0.733 recall: 0.44 73 out of 75: Transitional Principal-Intercalated Cell Pre-selected 1436 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000160951', 'ENSG00000130222', 'ENSG00000259120'] fbeta: 0.714 precision: 0.947 recall: 0.36 74 out of 75: Vascular Smooth Muscle Cell Pre-selected 242 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000122367', 'ENSG00000156113'] fbeta: 0.767 precision: 0.92 recall: 0.46 75 out of 75: Vascular Smooth Muscle Cell / Pericyte Pre-selected 89 genes to feed into Random Forest. NSForest-selected markers: ['ENSG00000113721', 'ENSG00000131711'] fbeta: 0.5 precision: 0.6 recall: 0.3 Saving supplementary table as... outputs_kidney_Lake_2023/subclass.full_supplementary.csv Saving markers table as... outputs_kidney_Lake_2023/subclass.full_markers.csv using median
Calculating medians (means) per cluster: 100%|██████████| 75/75 [00:01<00:00, 59.58it/s]
Saving supplementary table as... outputs_kidney_Lake_2023/subclass.full_markers_onTarget_supp.csv Saving supplementary table as... outputs_kidney_Lake_2023/subclass.full_markers_onTarget.csv Saving final results table as... outputs_kidney_Lake_2023/subclass.full_results.csv Saving final results table as... outputs_kidney_Lake_2023/subclass.full_results.pkl --- 345.1476306915283 seconds ---
In [27]:
## show the NS-Forest results table (one row per cluster)
results
Out[27]:
| software_version | cluster_header | clusterName | clusterSize | f_score | precision | recall | TN | FP | FN | TP | marker_count | NSForest_markers | binary_genes | onTarget | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4.1 | subclass.full | Adaptive / Maladaptive / Repairing Fibroblast | 50 | 0.561224 | 0.916667 | 0.220000 | 3515 | 1 | 39 | 11 | 3 | [ENSG00000156218, ENSG00000077942, ENSG0000018... | [ENSG00000091986, ENSG00000156218, ENSG0000007... | 0.427534 |
| 1 | 4.1 | subclass.full | Adaptive / Maladaptive / Repairing Proximal Tu... | 50 | 0.633803 | 0.782609 | 0.360000 | 3511 | 5 | 32 | 18 | 2 | [ENSG00000135220, ENSG00000170579] | [ENSG00000135220, ENSG00000146592, ENSG0000017... | 0.706288 |
| 2 | 4.1 | subclass.full | Adaptive / Maladaptive / Repairing Thick Ascen... | 50 | 0.615942 | 0.772727 | 0.340000 | 3511 | 5 | 33 | 17 | 3 | [ENSG00000129682, ENSG00000115221, ENSG0000012... | [ENSG00000152056, ENSG00000129682, ENSG0000011... | 0.273096 |
| 3 | 4.1 | subclass.full | Afferent / Efferent Arteriole Endothelial Cell | 50 | 0.760870 | 0.954545 | 0.420000 | 3515 | 1 | 29 | 21 | 2 | [ENSG00000135919, ENSG00000131477] | [ENSG00000135919, ENSG00000090006, ENSG0000016... | 0.778972 |
| 4 | 4.1 | subclass.full | Ascending Thin Limb Cell | 50 | 0.615942 | 0.772727 | 0.340000 | 3511 | 5 | 33 | 17 | 2 | [ENSG00000184374, ENSG00000275395] | [ENSG00000184374, ENSG00000064787, ENSG0000008... | 0.772847 |
| 5 | 4.1 | subclass.full | Ascending Vasa Recta Endothelial Cell | 50 | 0.730769 | 0.950000 | 0.380000 | 3515 | 1 | 31 | 19 | 2 | [ENSG00000148488, ENSG00000283632] | [ENSG00000249797, ENSG00000148488, ENSG0000015... | 1.000000 |
| 6 | 4.1 | subclass.full | B Cell | 50 | 0.773196 | 0.833333 | 0.600000 | 3510 | 6 | 20 | 30 | 1 | [ENSG00000156738] | [ENSG00000156738, ENSG00000153064, ENSG0000013... | 1.000000 |
| 7 | 4.1 | subclass.full | Classical Dendritic Cell | 50 | 0.857143 | 0.900000 | 0.720000 | 3512 | 4 | 14 | 36 | 1 | [ENSG00000131203] | [ENSG00000131203, ENSG00000140090, ENSG0000023... | 1.000000 |
| 8 | 4.1 | subclass.full | Connecting Tubule Cell | 50 | 0.447761 | 0.571429 | 0.240000 | 3507 | 9 | 38 | 12 | 3 | [ENSG00000251504, ENSG00000165685, ENSG0000022... | [ENSG00000251504, ENSG00000104327, ENSG0000016... | 0.118911 |
| 9 | 4.1 | subclass.full | Connecting Tubule Intercalated Cell Type A | 50 | 0.324675 | 0.384615 | 0.200000 | 3500 | 16 | 40 | 10 | 2 | [ENSG00000112530, ENSG00000145147] | [ENSG00000185274, ENSG00000154678, ENSG0000011... | 0.240039 |
| 10 | 4.1 | subclass.full | Connecting Tubule Principal Cell | 50 | 0.555556 | 0.736842 | 0.280000 | 3511 | 5 | 36 | 14 | 3 | [ENSG00000159167, ENSG00000104327, ENSG0000020... | [ENSG00000006327, ENSG00000159167, ENSG0000010... | 0.408436 |
| 11 | 4.1 | subclass.full | Cortical Collecting Duct Intercalated Cell Type A | 50 | 0.590909 | 0.866667 | 0.260000 | 3514 | 2 | 37 | 13 | 3 | [ENSG00000278961, ENSG00000185274, ENSG0000014... | [ENSG00000278961, ENSG00000144227, ENSG0000018... | 0.373059 |
| 12 | 4.1 | subclass.full | Cortical Collecting Duct Principal Cell | 50 | 0.563380 | 0.695652 | 0.320000 | 3509 | 7 | 34 | 16 | 2 | [ENSG00000182752, ENSG00000184672] | [ENSG00000169071, ENSG00000182752, ENSG0000025... | 0.319824 |
| 13 | 4.1 | subclass.full | Cortical Thick Ascending Limb Cell | 50 | 0.533708 | 0.593750 | 0.380000 | 3503 | 13 | 31 | 19 | 2 | [ENSG00000113946, ENSG00000036672] | [ENSG00000169347, ENSG00000113946, ENSG0000003... | 1.000000 |
| 14 | 4.1 | subclass.full | Cycling Connecting Tubule Cell | 12 | 0.340909 | 0.375000 | 0.250000 | 3549 | 5 | 9 | 3 | 2 | [ENSG00000102870, ENSG00000158805] | [ENSG00000102870, ENSG00000138587, ENSG0000015... | 1.000000 |
| 15 | 4.1 | subclass.full | Cycling Distal Convoluted Tubule Cell | 6 | 0.714286 | 1.000000 | 0.333333 | 3560 | 0 | 4 | 2 | 2 | [ENSG00000058404, ENSG00000066279] | [ENSG00000105928, ENSG00000058404, ENSG0000012... | 1.000000 |
| 16 | 4.1 | subclass.full | Cycling Endothelial Cell | 50 | 0.570175 | 0.812500 | 0.260000 | 3513 | 3 | 37 | 13 | 2 | [ENSG00000178878, ENSG00000139734] | [ENSG00000106462, ENSG00000178878, ENSG0000011... | 0.711106 |
| 17 | 4.1 | subclass.full | Cycling Mononuclear Phagocyte | 48 | 0.375000 | 0.750000 | 0.125000 | 3516 | 2 | 42 | 6 | 3 | [ENSG00000077420, ENSG00000101639, ENSG0000018... | [ENSG00000170017, ENSG00000077420, ENSG0000014... | 0.242716 |
| 18 | 4.1 | subclass.full | Cycling Myofibroblast | 18 | 0.760870 | 1.000000 | 0.388889 | 3548 | 0 | 11 | 7 | 2 | [ENSG00000143476, ENSG00000152402] | [ENSG00000136492, ENSG00000123219, ENSG0000011... | 0.779364 |
| 19 | 4.1 | subclass.full | Cycling Natural Killer Cell / Natural Killer T... | 16 | 0.892857 | 1.000000 | 0.625000 | 3550 | 0 | 6 | 10 | 2 | [ENSG00000166803, ENSG00000183918] | [ENSG00000166803, ENSG00000164104, ENSG0000018... | 1.000000 |
| 20 | 4.1 | subclass.full | Cycling Proximal Tubule Epithelial Cell | 50 | 0.353982 | 0.363636 | 0.320000 | 3488 | 28 | 34 | 16 | 2 | [ENSG00000165304, ENSG00000132780] | [ENSG00000165304, ENSG00000132780, ENSG0000012... | 0.191920 |
| 21 | 4.1 | subclass.full | Degenerative Ascending Thin Limb Cell | 50 | 0.592784 | 0.638889 | 0.460000 | 3503 | 13 | 27 | 23 | 2 | [ENSG00000197249, ENSG00000140600] | [ENSG00000197249, ENSG00000275395, ENSG0000014... | 0.437558 |
| 22 | 4.1 | subclass.full | Degenerative Connecting Tubule Cell | 50 | 0.500000 | 0.900000 | 0.180000 | 3515 | 1 | 41 | 9 | 3 | [ENSG00000162896, ENSG00000145536, ENSG0000010... | [ENSG00000162896, ENSG00000145536, ENSG0000010... | 0.285822 |
| 23 | 4.1 | subclass.full | Degenerative Cortical Intercalated Cell Type A | 50 | 0.718391 | 0.806452 | 0.500000 | 3510 | 6 | 25 | 25 | 2 | [ENSG00000130203, ENSG00000151418] | [ENSG00000213185, ENSG00000152931, ENSG0000013... | 0.437250 |
| 24 | 4.1 | subclass.full | Degenerative Cortical Thick Ascending Limb Cell | 50 | 0.635593 | 0.882353 | 0.300000 | 3514 | 2 | 35 | 15 | 2 | [ENSG00000198431, ENSG00000124107] | [ENSG00000203907, ENSG00000135931, ENSG0000019... | 0.235051 |
| 25 | 4.1 | subclass.full | Degenerative Descending Thin Limb Cell Type 3 | 50 | 0.786517 | 0.875000 | 0.560000 | 3512 | 4 | 22 | 28 | 2 | [ENSG00000133475, ENSG00000115641] | [ENSG00000019186, ENSG00000187957, ENSG0000013... | 0.940297 |
| 26 | 4.1 | subclass.full | Degenerative Distal Convoluted Tubule Cell | 50 | 0.741758 | 0.818182 | 0.540000 | 3510 | 6 | 23 | 27 | 2 | [ENSG00000121769, ENSG00000178343] | [ENSG00000166426, ENSG00000121769, ENSG0000018... | 1.000000 |
| 27 | 4.1 | subclass.full | Degenerative Endothelial Cell | 50 | 0.425532 | 0.727273 | 0.160000 | 3513 | 3 | 42 | 8 | 2 | [ENSG00000142089, ENSG00000184831] | [ENSG00000142089, ENSG00000130300, ENSG0000018... | 0.212317 |
| 28 | 4.1 | subclass.full | Degenerative Fibroblast | 50 | 0.811688 | 0.961538 | 0.500000 | 3515 | 1 | 25 | 25 | 2 | [ENSG00000139329, ENSG00000142173] | [ENSG00000139329, ENSG00000159403, ENSG0000010... | 0.930817 |
| 29 | 4.1 | subclass.full | Degenerative Inner Medullary Collecting Duct Cell | 50 | 0.566667 | 0.680000 | 0.340000 | 3508 | 8 | 33 | 17 | 3 | [ENSG00000095303, ENSG00000171303, ENSG0000025... | [ENSG00000095303, ENSG00000171303, ENSG0000022... | 0.629699 |
| 30 | 4.1 | subclass.full | Degenerative Medullary Fibroblast | 50 | 0.240642 | 0.222222 | 0.360000 | 3453 | 63 | 32 | 18 | 2 | [ENSG00000085662, ENSG00000171345] | [ENSG00000109846, ENSG00000085662, ENSG0000012... | 0.118473 |
| 31 | 4.1 | subclass.full | Degenerative Medullary Thick Ascending Limb Cell | 50 | 0.413043 | 0.422222 | 0.380000 | 3490 | 26 | 31 | 19 | 3 | [ENSG00000169344, ENSG00000119715, ENSG0000007... | [ENSG00000169344, ENSG00000119715, ENSG0000007... | 0.163110 |
| 32 | 4.1 | subclass.full | Degenerative Outer Medullary Collecting Duct P... | 50 | 0.508475 | 0.705882 | 0.240000 | 3511 | 5 | 38 | 12 | 4 | [ENSG00000085117, ENSG00000166828, ENSG0000008... | [ENSG00000160951, ENSG00000159167, ENSG0000008... | 0.229297 |
| 33 | 4.1 | subclass.full | Degenerative Peritubular Capilary Endothelial ... | 50 | 0.320000 | 0.320000 | 0.320000 | 3482 | 34 | 34 | 16 | 3 | [ENSG00000102755, ENSG00000154217, ENSG0000014... | [ENSG00000102755, ENSG00000127329, ENSG0000026... | 0.111280 |
| 34 | 4.1 | subclass.full | Degenerative Podocyte | 50 | 0.873494 | 1.000000 | 0.580000 | 3516 | 0 | 21 | 29 | 2 | [ENSG00000166033, ENSG00000107742] | [ENSG00000159713, ENSG00000166033, ENSG0000010... | 0.685713 |
| 35 | 4.1 | subclass.full | Degenerative Proximal Tubule Epithelial Cell | 50 | 0.688406 | 0.863636 | 0.380000 | 3513 | 3 | 31 | 19 | 3 | [ENSG00000259579, ENSG00000164626, ENSG0000022... | [ENSG00000150275, ENSG00000259579, ENSG0000025... | 1.000000 |
| 36 | 4.1 | subclass.full | Degenerative Vascular Smooth Muscle Cell | 50 | 0.860215 | 0.941176 | 0.640000 | 3514 | 2 | 18 | 32 | 2 | [ENSG00000198467, ENSG00000198523] | [ENSG00000198467, ENSG00000198523, ENSG0000017... | 1.000000 |
| 37 | 4.1 | subclass.full | Descending Thin Limb Cell Type 1 | 50 | 0.555556 | 1.000000 | 0.200000 | 3516 | 0 | 40 | 10 | 2 | [ENSG00000197301, ENSG00000150471] | [ENSG00000197301, ENSG00000228412, ENSG0000011... | 0.645970 |
| 38 | 4.1 | subclass.full | Descending Thin Limb Cell Type 2 | 50 | 0.648148 | 0.750000 | 0.420000 | 3509 | 7 | 29 | 21 | 2 | [ENSG00000235139, ENSG00000188883] | [ENSG00000156687, ENSG00000235139, ENSG0000022... | 0.737245 |
| 39 | 4.1 | subclass.full | Descending Thin Limb Cell Type 3 | 50 | 0.545977 | 0.612903 | 0.380000 | 3504 | 12 | 31 | 19 | 2 | [ENSG00000249776, ENSG00000145721] | [ENSG00000249776, ENSG00000233611, ENSG0000014... | 1.000000 |
| 40 | 4.1 | subclass.full | Descending Vasa Recta Endothelial Cell | 50 | 0.636364 | 0.933333 | 0.280000 | 3515 | 1 | 36 | 14 | 2 | [ENSG00000141469, ENSG00000116833] | [ENSG00000136960, ENSG00000141469, ENSG0000011... | 1.000000 |
| 41 | 4.1 | subclass.full | Distal Convoluted Tubule Cell Type 1 | 50 | 0.674603 | 0.894737 | 0.340000 | 3514 | 2 | 33 | 17 | 2 | [ENSG00000237422, ENSG00000119121] | [ENSG00000237422, ENSG00000140470, ENSG0000024... | 0.694848 |
| 42 | 4.1 | subclass.full | Distal Convoluted Tubule Cell Type 2 | 50 | 0.549451 | 0.606061 | 0.400000 | 3503 | 13 | 30 | 20 | 3 | [ENSG00000165973, ENSG00000070915, ENSG0000018... | [ENSG00000165973, ENSG00000146021, ENSG0000011... | 0.262543 |
| 43 | 4.1 | subclass.full | Fibroblast | 50 | 0.573770 | 0.777778 | 0.280000 | 3512 | 4 | 36 | 14 | 3 | [ENSG00000140092, ENSG00000112936, ENSG0000011... | [ENSG00000262655, ENSG00000154262, ENSG0000014... | 0.330656 |
| 44 | 4.1 | subclass.full | Glomerular Capillary Endothelial Cell | 50 | 0.746269 | 0.952381 | 0.400000 | 3515 | 1 | 30 | 20 | 2 | [ENSG00000167941, ENSG00000145708] | [ENSG00000167941, ENSG00000013016, ENSG0000014... | 1.000000 |
| 45 | 4.1 | subclass.full | Inner Medullary Collecting Duct Cell | 50 | 0.681818 | 0.807692 | 0.420000 | 3511 | 5 | 29 | 21 | 3 | [ENSG00000228624, ENSG00000249853, ENSG0000022... | [ENSG00000228624, ENSG00000249853, ENSG0000013... | 0.700192 |
| 46 | 4.1 | subclass.full | Intercalated Cell Type B | 50 | 0.774648 | 0.956522 | 0.440000 | 3515 | 1 | 28 | 22 | 2 | [ENSG00000091137, ENSG00000188175] | [ENSG00000091137, ENSG00000027644, ENSG0000012... | 0.909313 |
| 47 | 4.1 | subclass.full | Lymphatic Endothelial Cell | 50 | 0.891089 | 0.947368 | 0.720000 | 3514 | 2 | 14 | 36 | 1 | [ENSG00000138722] | [ENSG00000138722, ENSG00000184058, ENSG0000020... | 1.000000 |
| 48 | 4.1 | subclass.full | M2 Macrophage | 50 | 0.714286 | 0.947368 | 0.360000 | 3515 | 1 | 32 | 18 | 2 | [ENSG00000260314, ENSG00000137491] | [ENSG00000177575, ENSG00000260314, ENSG0000013... | 1.000000 |
| 49 | 4.1 | subclass.full | Macula Densa Cell | 50 | 0.655738 | 0.888889 | 0.320000 | 3514 | 2 | 34 | 16 | 2 | [ENSG00000139220, ENSG00000089250] | [ENSG00000116183, ENSG00000091128, ENSG0000013... | 0.919283 |
| 50 | 4.1 | subclass.full | Mast Cell | 50 | 0.890805 | 1.000000 | 0.620000 | 3516 | 0 | 19 | 31 | 1 | [ENSG00000185052] | [ENSG00000163751, ENSG00000197253, ENSG0000010... | 0.799457 |
| 51 | 4.1 | subclass.full | Medullary Fibroblast | 50 | 0.704225 | 0.869565 | 0.400000 | 3513 | 3 | 30 | 20 | 2 | [ENSG00000182256, ENSG00000259275] | [ENSG00000182256, ENSG00000079931, ENSG0000006... | 1.000000 |
| 52 | 4.1 | subclass.full | Medullary Thick Ascending Limb Cell | 50 | 0.634328 | 0.809524 | 0.340000 | 3512 | 4 | 33 | 17 | 3 | [ENSG00000159261, ENSG00000066230, ENSG0000011... | [ENSG00000159261, ENSG00000066230, ENSG0000028... | 1.000000 |
| 53 | 4.1 | subclass.full | Mesangial Cell | 50 | 0.710526 | 0.771429 | 0.540000 | 3508 | 8 | 23 | 27 | 2 | [ENSG00000188517, ENSG00000133816] | [ENSG00000144891, ENSG00000154864, ENSG0000013... | 0.532527 |
| 54 | 4.1 | subclass.full | Monocyte-derived Cell | 50 | 0.368852 | 0.500000 | 0.180000 | 3507 | 9 | 41 | 9 | 2 | [ENSG00000137462, ENSG00000136250] | [ENSG00000137462, ENSG00000119900, ENSG0000010... | 0.692946 |
| 55 | 4.1 | subclass.full | Myofibroblast | 50 | 0.636364 | 0.933333 | 0.280000 | 3515 | 1 | 36 | 14 | 2 | [ENSG00000198542, ENSG00000249669] | [ENSG00000198542, ENSG00000152402, ENSG0000010... | 0.669407 |
| 56 | 4.1 | subclass.full | Natural Killer Cell / Natural Killer T Cell | 50 | 0.779570 | 0.852941 | 0.580000 | 3511 | 5 | 21 | 29 | 2 | [ENSG00000105374, ENSG00000275302] | [ENSG00000180644, ENSG00000105374, ENSG0000011... | 1.000000 |
| 57 | 4.1 | subclass.full | Neutrophil | 50 | 0.816832 | 0.868421 | 0.660000 | 3511 | 5 | 17 | 33 | 1 | [ENSG00000143546] | [ENSG00000143546, ENSG00000163220, ENSG0000005... | 1.000000 |
| 58 | 4.1 | subclass.full | Non-classical Monocyte | 50 | 0.634328 | 0.809524 | 0.340000 | 3512 | 4 | 33 | 17 | 2 | [ENSG00000085265, ENSG00000028277] | [ENSG00000085265, ENSG00000204482, ENSG0000001... | 1.000000 |
| 59 | 4.1 | subclass.full | Outer Medullary Collecting Duct Intercalated C... | 50 | 0.775862 | 0.870968 | 0.540000 | 3512 | 4 | 23 | 27 | 2 | [ENSG00000253675, ENSG00000258881] | [ENSG00000253675, ENSG00000106302, ENSG0000023... | 1.000000 |
| 60 | 4.1 | subclass.full | Outer Medullary Collecting Duct Principal Cell | 50 | 0.652174 | 0.818182 | 0.360000 | 3512 | 4 | 32 | 18 | 2 | [ENSG00000254789, ENSG00000267659] | [ENSG00000254789, ENSG00000254695, ENSG0000016... | 0.710974 |
| 61 | 4.1 | subclass.full | Papillary Tip Epithelial Cell | 50 | 0.783582 | 1.000000 | 0.420000 | 3516 | 0 | 29 | 21 | 2 | [ENSG00000174226, ENSG00000142973] | [ENSG00000171401, ENSG00000174226, ENSG0000014... | 1.000000 |
| 62 | 4.1 | subclass.full | Parietal Epithelial Cell | 50 | 0.637255 | 1.000000 | 0.260000 | 3516 | 0 | 37 | 13 | 2 | [ENSG00000112414, ENSG00000000971] | [ENSG00000162692, ENSG00000112414, ENSG0000000... | 0.868456 |
| 63 | 4.1 | subclass.full | Peritubular Capilary Endothelial Cell | 50 | 0.584416 | 0.692308 | 0.360000 | 3508 | 8 | 32 | 18 | 2 | [ENSG00000163687, ENSG00000102755] | [ENSG00000163687, ENSG00000168497, ENSG0000012... | 0.592903 |
| 64 | 4.1 | subclass.full | Plasma Cell | 50 | 0.783582 | 1.000000 | 0.420000 | 3516 | 0 | 29 | 21 | 2 | [ENSG00000167077, ENSG00000183508] | [ENSG00000170476, ENSG00000167077, ENSG0000018... | 1.000000 |
| 65 | 4.1 | subclass.full | Plasmacytoid Dendritic Cell | 31 | 0.900901 | 1.000000 | 0.645161 | 3535 | 0 | 11 | 20 | 1 | [ENSG00000267337] | [ENSG00000267337, ENSG00000111249, ENSG0000019... | 1.000000 |
| 66 | 4.1 | subclass.full | Podocyte | 50 | 0.902062 | 0.972222 | 0.700000 | 3515 | 1 | 15 | 35 | 2 | [ENSG00000139304, ENSG00000178038] | [ENSG00000139304, ENSG00000155816, ENSG0000014... | 1.000000 |
| 67 | 4.1 | subclass.full | Proximal Tubule Epithelial Cell Segment 1 / Se... | 50 | 0.701754 | 1.000000 | 0.320000 | 3516 | 0 | 34 | 16 | 2 | [ENSG00000171759, ENSG00000171766] | [ENSG00000149452, ENSG00000250799, ENSG0000017... | 0.680535 |
| 68 | 4.1 | subclass.full | Proximal Tubule Epithelial Cell Segment 3 | 50 | 0.681818 | 1.000000 | 0.300000 | 3516 | 0 | 35 | 15 | 2 | [ENSG00000158865, ENSG00000156222] | [ENSG00000158865, ENSG00000154025, ENSG0000022... | 0.957749 |
| 69 | 4.1 | subclass.full | Renin-positive Juxtaglomerular Granular Cell | 50 | 0.954198 | 0.943396 | 1.000000 | 3513 | 3 | 0 | 50 | 1 | [ENSG00000143839] | [ENSG00000143839, ENSG00000152208, ENSG0000010... | 1.000000 |
| 70 | 4.1 | subclass.full | Schwann Cell / Neural | 35 | 0.952381 | 1.000000 | 0.800000 | 3531 | 0 | 7 | 28 | 1 | [ENSG00000179915] | [ENSG00000179915, ENSG00000175161, ENSG0000007... | 1.000000 |
| 71 | 4.1 | subclass.full | T Cell | 50 | 0.647059 | 0.733333 | 0.440000 | 3508 | 8 | 28 | 22 | 1 | [ENSG00000168685] | [ENSG00000168685, ENSG00000153283, ENSG0000017... | 1.000000 |
| 72 | 4.1 | subclass.full | Transitional Principal-Intercalated Cell | 50 | 0.714286 | 0.947368 | 0.360000 | 3515 | 1 | 32 | 18 | 3 | [ENSG00000160951, ENSG00000130222, ENSG0000025... | [ENSG00000160951, ENSG00000130222, ENSG0000025... | 0.311119 |
| 73 | 4.1 | subclass.full | Vascular Smooth Muscle Cell | 50 | 0.766667 | 0.920000 | 0.460000 | 3514 | 2 | 27 | 23 | 2 | [ENSG00000122367, ENSG00000156113] | [ENSG00000122367, ENSG00000156113, ENSG0000016... | 1.000000 |
| 74 | 4.1 | subclass.full | Vascular Smooth Muscle Cell / Pericyte | 50 | 0.500000 | 0.600000 | 0.300000 | 3506 | 10 | 35 | 15 | 2 | [ENSG00000113721, ENSG00000131711] | [ENSG00000138031, ENSG00000113721, ENSG0000013... | 0.277395 |
4. Plotting¶
load NS-Forest results (optional: copy the setup and load the saved pkl)¶
In [28]:
# ## Optional: uncomment this whole cell to plot from the saved results pkl.
# ## Lines marked #<--- are the values that build the output folder and pkl path.
# ## set up
# organ = "kidney" #<---
# author = "Lake" #<---
# year = "2023" #<---
# output_folder = "outputs_" + organ + "_" + author + "_" + year + "/" #e.g., "outputs_kidney_Lake_2023/"
# cluster_header = "subclass.full" #<---
# outputfilename_suffix = cluster_header
# outputfilename_prefix = cluster_header
# ## load NS-Forest results
# results = pd.read_pickle(output_folder + cluster_header + "_results.pkl")
In [29]:
## set results to plot
## NOTE: this binds the same DataFrame object as `results` (no copy is made),
## so in-place edits made through either name are visible through both.
results_to_plot = results
boxplots¶
In [30]:
## boxplot of the f_score column, saved as html
ns.pl.boxplot(
    results_to_plot,
    "f_score",
    save = "html",
    output_folder = output_folder,
    outputfilename_prefix = outputfilename_prefix,
)
Saving... outputs_kidney_Lake_2023/subclass.full_boxplot_f_score.html
In [31]:
## boxplot of the precision column, saved as html
ns.pl.boxplot(
    results_to_plot,
    "precision",
    save = "html",
    output_folder = output_folder,
    outputfilename_prefix = outputfilename_prefix,
)
Saving... outputs_kidney_Lake_2023/subclass.full_boxplot_precision.html
In [32]:
## boxplot of the recall column, saved as html
ns.pl.boxplot(
    results_to_plot,
    "recall",
    save = "html",
    output_folder = output_folder,
    outputfilename_prefix = outputfilename_prefix,
)
Saving... outputs_kidney_Lake_2023/subclass.full_boxplot_recall.html
In [33]:
## boxplot of the onTarget column, saved as html
ns.pl.boxplot(
    results_to_plot,
    "onTarget",
    save = "html",
    output_folder = output_folder,
    outputfilename_prefix = outputfilename_prefix,
)
Saving... outputs_kidney_Lake_2023/subclass.full_boxplot_onTarget.html
scatter plots w.r.t. cluster size¶
In [34]:
## scatter of f_score w.r.t. cluster size
## Uses results_to_plot (not results) so the table chosen in the
## "set results to plot" cell drives this plot, same as the boxplots.
ns.pl.scatter_w_clusterSize(
    results_to_plot,
    "f_score",
    save = True,
    output_folder = output_folder,
    outputfilename_prefix = outputfilename_prefix,
)
Saving... outputs_kidney_Lake_2023/subclass.full_scatter_f_score.html
In [35]:
## scatter of precision w.r.t. cluster size
## Uses results_to_plot (not results) so the table chosen in the
## "set results to plot" cell drives this plot, same as the boxplots.
ns.pl.scatter_w_clusterSize(
    results_to_plot,
    "precision",
    save = True,
    output_folder = output_folder,
    outputfilename_prefix = outputfilename_prefix,
)
Saving... outputs_kidney_Lake_2023/subclass.full_scatter_precision.html
In [36]:
## scatter of recall w.r.t. cluster size
## Uses results_to_plot (not results) so the table chosen in the
## "set results to plot" cell drives this plot, same as the boxplots.
ns.pl.scatter_w_clusterSize(
    results_to_plot,
    "recall",
    save = True,
    output_folder = output_folder,
    outputfilename_prefix = outputfilename_prefix,
)
Saving... outputs_kidney_Lake_2023/subclass.full_scatter_recall.html
In [37]:
## scatter of onTarget w.r.t. cluster size
## Uses results_to_plot (not results) so the table chosen in the
## "set results to plot" cell drives this plot, same as the boxplots.
ns.pl.scatter_w_clusterSize(
    results_to_plot,
    "onTarget",
    save = True,
    output_folder = output_folder,
    outputfilename_prefix = outputfilename_prefix,
)
Saving... outputs_kidney_Lake_2023/subclass.full_scatter_onTarget.html
In [38]:
## Download gene mapping utilities from cell-kn (one-time)
import urllib.request
import os

# NOTE(review): this fetches a Python module from the `main` branch and then
# imports (executes) it. Consider pinning the URL to a reviewed commit.
gene_mapping_utils_path = 'gene_mapping_utils.py'
if not os.path.exists(gene_mapping_utils_path):
    print("Downloading gene_mapping_utils.py from cell-kn...")
    url = "https://raw.githubusercontent.com/NIH-NLM/cell-kn/main/utils/gene_mapping_utils.py"
    # Download under a temporary name and rename only on success. Otherwise an
    # interrupted download leaves a truncated module behind, and the exists()
    # check above would treat it as already downloaded on the next run.
    partial_download_path = gene_mapping_utils_path + ".part"
    try:
        urllib.request.urlretrieve(url, partial_download_path)
        os.replace(partial_download_path, gene_mapping_utils_path)
    finally:
        if os.path.exists(partial_download_path):
            os.remove(partial_download_path)
    print("Downloaded!")

## Import and use
from gene_mapping_utils import load_gene_mapping, create_mapping_dict

# Load mapping (cached after first run)
gene_mapping = load_gene_mapping()
ensg_to_symbol = create_mapping_dict(gene_mapping)
print(f"Loaded {len(gene_mapping)} gene mappings")

# Map ENSG IDs to gene names; IDs without a mapping are kept unchanged.
# assign() returns a new DataFrame, so the `results` table that
# results_to_plot was bound to is not modified and the cell is safe to re-run.
results_to_plot = results_to_plot.assign(
    gene_names=[
        [ensg_to_symbol.get(gene, gene) for gene in markers]
        for markers in results_to_plot['NSForest_markers']
    ]
)
print(f"\nMapped markers for {len(results_to_plot)} clusters")

# Create markers_dict: clusterName -> list of marker gene names
markers_dict = dict(zip(results_to_plot["clusterName"],
                        results_to_plot["gene_names"]))
print(f"markers_dict created with {len(markers_dict)} clusters")
Loading gene mapping from gene_mapping.csv Loaded 34460 gene mappings Mapped markers for 75 clusters markers_dict created with 75 clusters
In [39]:
## Add gene symbol annotations to adata
# adata.var gets a 'gene_symbol' column: the mapped symbol for each var name,
# or the var name itself when ensg_to_symbol has no entry for it.
adata.var['gene_symbol'] = list(
    map(lambda ensg_id: ensg_to_symbol.get(ensg_id, ensg_id), adata.var_names)
)
In [40]:
## dotplot of the markers in markers_dict, grouped by cluster_header
ns.pl.dotplot(
    adata,
    markers_dict,
    cluster_header,
    dendrogram = True,
    use_raw = False,
    gene_symbols = 'gene_symbol',  # Display symbols instead of ENSG
    save = "svg",
    output_folder = output_folder,
    outputfilename_suffix = outputfilename_prefix,
)
WARNING: saving figure to file outputs_kidney_Lake_2023/dotplot_subclass.full.svg
In [41]:
## dotplot of the markers in markers_dict, with standard_scale = 'var'
ns.pl.dotplot(
    adata,
    markers_dict,
    cluster_header,
    dendrogram = True,
    use_raw = False,
    gene_symbols = 'gene_symbol',  # Display symbols instead of ENSG
    standard_scale = 'var',
    save = "svg",
    output_folder = output_folder,
    outputfilename_suffix = outputfilename_prefix + "_scaled",
)
WARNING: saving figure to file outputs_kidney_Lake_2023/dotplot_subclass.full_scaled.svg
In [42]:
## stacked violin plot of the markers in markers_dict, grouped by cluster_header
ns.pl.stackedviolin(
    adata,
    markers_dict,
    cluster_header,
    dendrogram = True,
    use_raw = False,
    gene_symbols = 'gene_symbol',  # Display symbols instead of ENSG
    save = "svg",
    output_folder = output_folder,
    outputfilename_suffix = outputfilename_prefix,
)
WARNING: saving figure to file outputs_kidney_Lake_2023/stacked_violin_subclass.full.svg
In [ ]:
## stacked violin plot of the markers in markers_dict, with standard_scale = 'var'
ns.pl.stackedviolin(
    adata,
    markers_dict,
    cluster_header,
    dendrogram = True,
    use_raw = False,
    gene_symbols = 'gene_symbol',  # Display symbols instead of ENSG
    standard_scale = 'var',
    save = "svg",
    output_folder = output_folder,
    outputfilename_suffix = outputfilename_prefix + "_scaled",
)
In [ ]:
## matrix plot of the markers in markers_dict, grouped by cluster_header
ns.pl.matrixplot(
    adata,
    markers_dict,
    cluster_header,
    dendrogram = True,
    use_raw = False,
    gene_symbols = 'gene_symbol',  # Display symbols instead of ENSG
    save = "svg",
    output_folder = output_folder,
    outputfilename_suffix = outputfilename_prefix,
)
In [ ]:
## matrix plot of the markers in markers_dict, with standard_scale = 'var'
ns.pl.matrixplot(
    adata,
    markers_dict,
    cluster_header,
    dendrogram = True,
    use_raw = False,
    gene_symbols = 'gene_symbol',  # Display symbols instead of ENSG
    standard_scale = 'var',
    save = "svg",
    output_folder = output_folder,
    outputfilename_suffix = outputfilename_prefix + "_scaled",
)
Save¶
count saved items in the output folder¶
In [ ]:
from pathlib import Path

## count the direct entries of the output folder (not recursive)
folder_path = Path(output_folder)
item_count = sum(1 for _entry in folder_path.iterdir())
print(f"Total items in the output folder: {item_count}")
saving html report¶
In [ ]:
## for interactive plot
## import the offline submodule explicitly; this still binds the name `plotly`
import plotly.offline

plotly.offline.init_notebook_mode()
In [ ]:
## save html report
## Runs `jupyter nbconvert --to html` on the notebook file named below via the shell.
## NOTE(review): the filename is marked #<--- ; presumably it must be edited to match this notebook's file name.
!jupyter nbconvert --to html DEMO_NS-Forest_workflow.ipynb #<---